

#########
#########
######### Pre-process Data
######### JAERE Data Repository
######### 







###
### Part I:
### Get Coefficient of Performance (COP) curves for TOP-selling and for DOE Prototypical CCHP Challenge
###
### input: NEEP data, Carrier manual (numbers keyed in manually)
### output: COP2.png plot
### 
### 
### Part II:
### Pull PRISM tmean from 2015 to 2018 and add up HDD65, HDD50.
### Calculate average COP for TOP and DOE CCHP for each cell of PRISM
### 
### input:  none
### output:WCOP_HDD_PRISM.rds, a raster brick with HDD65, average COP for each cell
### 
### 
### 
### Part III: 
### Rates process (and GPSC Georgia price data) & map utilities to spatial zip
### output: spatial_rate_database.rds
### 
### Part IV:
### Pull and process RECS
### output: RECS.csv
### 
### Part V:
### Download CPI
### input: none
### output: CPI.csv
### 
### 


# unless otherwise noted, analysis is agnostic as to package version
require(XML)
require(RCurl)
require(raster)
require(sf)
require(tigris)
require(here)
require(beepr)
require(readxl)
require(fixest)
require(data.table) # version 1.14.8
require(zoo)
require(lubridate) # version 1.9.3
require(mapview)
require(tidyverse)
require(broom)
require(httr)  # for reading GA files
require(blsAPI)
setDTthreads(percent=80)


# Set BASE to Code Repository folder
BASE = file.path('YOURPATHHERE/Code Repository')  # Point to Code Repository folder here
DATA.PROC = file.path(BASE, 'Data_Proc'); dir.create(DATA.PROC)
DATA.ARCHIVE = file.path(BASE, 'Data_Archive')
FINAL.OUTPUT = file.path(BASE, 'Final_Output'); dir.create(FINAL.OUTPUT)

blsAPIkey = # Get BLS API key https://www.bls.gov/developers/home.htm   

Mode = function(x){
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}




########
########
### Use engineering estimates to get effective COP
### 
### 
# VT says avg COP of 2.4 https://www.greenbuildingadvisor.com/question/need-to-know-mitsubishi-heat-pump-cops-at-temps-below-17-degrees-f
# MA DOER says COP requirements of: https://neep.org/sites/default/files/media-files/cold_climate_air_source_heat_pump_specification_-_version_4.0_final_1.pdf

# NEEP 2018 product listings: https://neep.org/sites/default/files/ColdClimateAir-SourceHeatPumpSpecificationProductListing-Updated5.3.18.xlsx



## COP curve for all cold-climate heat pumps from NEEP CCHP directory from 2018
# NEEPdata = 'https://neep.org/sites/default/files/ColdClimateAir-SourceHeatPumpSpecificationProductListing-Updated5.3.18.xlsx'


ND = NDRAW = read_excel(path = file.path(DATA.ARCHIVE, 'ColdClimateAir-SourceHeatPumpSpecificationProductListing-Updated5.3.18.xlsx'), skip = 6, na = c('n/A','N/A'))
NDnames = names(ND)[c(1,2,4,5,6,7, 13)]
NDnames = c(NDnames, grep(names(ND), pattern='COP at Rated Capacity 47|COP at Rated Capacity 17|COP at Max\\. Capacity 5|COP at Max\\. Capacity X|Outdoor Dry', value=T))
ND = ND[,NDnames]
names(ND) = c('Mfg','Brand','AHRI_num','AHRI_type','OutMod','InMod','Ducted','COP_47','COP_17','COP_5','X_temp','COP_X')

ND = ND %>%
  dplyr::mutate(COP_5 = as.numeric(COP_5),
                X_temp = as.integer(X_temp)) %>%
  pivot_longer(cols = contains('COP'), values_to = 'COP') %>%
  dplyr::mutate(temp = as.integer(str_extract(name,('\\d+')))) %>%
  dplyr::filter(Ducted=='Centrally Ducted') %>%
  dplyr::mutate(temp = case_when(!is.na(temp) ~ temp,
                                 !is.na(X_temp) ~ X_temp,
                                 T ~ as.integer(NA))) %>%
  dplyr::filter(!is.na(COP) & !is.na(temp)) %>%
  group_by(AHRI_num) %>%
  dplyr::mutate(reportsCold = any(temp<5))


### And this one
PROTO.DOE = as.data.table(ND %>% dplyr::filter(AHRI_num %in% c('201754323','8203122')[1])) %>%
  dplyr::mutate(COP = ifelse(temp>=5 & COP<=2.4, 2.4, COP)) %>%
  slice(c(1,3,4))  # drop the value at temp=17 to fit 2nd degree poly

PROTO.DOE.reg = feols(COP ~ I(temp>5):I(temp-5) + I(temp<5):I(5-temp), PROTO.DOE)
PROTO.DOE.reg$UB = predict(PROTO.DOE.reg, newdata = data.frame(temp = 47))



## TOP is CH14NB series - use 036 (3 ton 36000 btu) (second TOP) -- using https://www.shareddocs.com/hvac/docs/1009/Public/0D/CH14NB-07PD.pdf
TOP = data.table(temp = c(47,   17,     7,    -3),  # uses 1200 CFM, last entry, 36000BTU (3 ton) version (most relevant, proper for 1500 sqft)
                 COP = c(3.614, 2.207, 1.850, 1.485),
                 AHRI_num = 'TOP', reportsCol = FALSE)
TOP.reg = lm(COP ~ poly(temp, 2), TOP)
TOP.reg$UB = predict(TOP.reg, newdata = data.frame(temp = 47)) #--> UB is used in the jCop function to limit the efficiency above 47F




### Output ~Fig B.1, Fitted COP curves for three heat pumps

png(file.path(FINAL.OUTPUT, '2COP.png'), width = 8, height = 7, units = 'in', res = 300)
    plot(y = predict(TOP.reg, newdata = data.frame(temp = 47:-20)), x = 47:-20, col = 'darkgray', lwd=3,
         xlab = 'Temp F', ylab = 'COP', xlim = c(-20, 47), ylim = c(1,4))
    points(COP ~ temp, TOP, col = 'darkgray', pch = 3, cex = 2, lwd=2)
    lines(y = predict(PROTO.DOE.reg, newdata = data.frame(temp = 47:-20)), x = 47:-20, col = 'orange', lwd=3)
    points(COP ~ temp, PROTO.DOE, col = 'orange', pch = 4, cex = 2, lwd=2)
    legend('topleft', c('Top-Selling HP','DoE Challenge CCHP'), col = c('darkgray','orange'), lty=1, lwd=3, pch = c(3,4))
dev.off()


### Get temp data to use with COP curves ###
###


jCOP<-function(t, reg.obj){
  rawpred = predict(reg.obj, newdata = data.frame(temp = t))
  rawpred[rawpred>=reg.obj$UB] = reg.obj$UB
  rawpred[rawpred<1] = 1
  return(rawpred)
}

jTemps<-function(TU, TL){ # fixed length on 7-31-23; may need to re-output results! # 8-1 now harmonic mean (1/jCOP(ii))
  TAMP = (TU-TL)/2
  TBAR = (TU+TL)/2
  TBAR - TAMP*cos(seq(0, 2*pi, length.out=24))
}

jGetCOPfixed<-function(uTU, uTL, reg.obj){
  unlist(lapply(mapply(function(x,y){jTemps(x,y)}, x = uTU, y = uTL, SIMPLIFY=F),
                function(ii){
                  replace_na(weighted.mean(1/jCOP(ii, reg.obj), w = pmax(0, 65-ii)), 1/reg.obj$UB)
                }))}


## Using PROTO and TOP where PROTO is 201754323 and TOP is CH14N (see OneNote)
HDDyearStart = 2015
HDDyearEnd = 2019
first = TRUE
counter = 0
td = tempdir()
tf1 = tempfile(tmpdir = td)
tf2 = tempfile(tmpdir = td)

for(yr in HDDyearStart:HDDyearEnd){
  lookurl1 = paste0('https://ftp.prism.oregonstate.edu/daily/tmax/',yr,'/')
  lookurl2 = paste0('https://ftp.prism.oregonstate.edu/daily/tmin/',yr,'/')
  
  print(lookurl1)
  yrfiles1 = grep(pattern='\\.zip$', getHTMLLinks(getURL(lookurl1, verbose=F, ftp.use.epsv=T, dirlistonly=T)), value=T)
  yrfiles2 = grep(pattern='\\.zip$', getHTMLLinks(getURL(lookurl2, verbose=F, ftp.use.epsv=T, dirlistonly=T)), value=T)
  stopifnot(length(yrfiles1)==length(yrfiles2))
  
  for(fl1 in yrfiles1){
    print(fl1)
    fl2 = gsub(fl1, pattern='tmax', replacement = 'tmin')
    # get the file and unzip it
    download.file(url = paste0(lookurl1, fl1), destfile = tf1)
    download.file(url = paste0(lookurl2, fl2), destfile = tf2)
    unzip(tf1, exdir = td)
    unzip(tf2, exdir = td)
    flbil1 = gsub(pattern='\\.zip$', replace = '.bil', fl1)
    flbil2 = gsub(pattern='\\.zip$', replace = '.bil', fl2)
    
    # rasterize it
    fl_rasterMax = raster(file.path(td, flbil1))
    fl_rasterMin = raster(file.path(td, flbil2))
    
    
    if(first==T){
      COP_raster = copy(fl_rasterMax)
      values(COP_raster) = values(COP_raster)*0
      COP_rasterb = brick(list(TMEAN = COP_raster, 
                               HDD65 = COP_raster,
                               WCOP65.TOP = COP_raster,
                               WCOP65.PROTO.DOE = COP_raster))
    }
    
    procDT = data.table(max = round(32 + (9/5)*values(fl_rasterMax)),
                        min = round(32 + (9/5)*values(fl_rasterMin)))
    procDTu = unique(procDT)
    procDTu[,tmean:=(max+min)/2]
    procDTu[,c('TMEAN','HDD65',
               'avginvCOP.TOP',
               'avginvCOP.PROTO.DOE'):=list(tmean,
                                            pmax(0, 65-tmean),
                                            jGetCOPfixed(uTU = max, uTL = min, reg.obj = TOP.reg),
                                            jGetCOPfixed(uTU = max, uTL = min, reg.obj = PROTO.DOE.reg))]
    
    procDTu[,c('WCOP65.TOP','WCOP65.PROTO.DOE'):=list(avginvCOP.TOP*HDD65, avginvCOP.PROTO.DOE*HDD65)]                                         
    
    procDT = merge(x = procDT, y = procDTu, by = c('max','min'), sort = F)
    values(COP_rasterb) = values(COP_rasterb) + as.matrix(procDT[,.(TMEAN, HDD65, WCOP65.TOP, WCOP65.PROTO.DOE)])
    
    counter = counter + 1
    first = FALSE
    unlink(file.path(td, flbil1))  # delete the .bil file
    unlink(file.path(td, flbil2))
  }
}


COP_rasterb = addLayer(COP_rasterb, 
                       AWCOP65.TOP   =     1/(COP_rasterb[['WCOP65.TOP']]/COP_rasterb[['HDD65']]), 
                       AWCOP65.PROTO.DOE = 1/(COP_rasterb[['WCOP65.PROTO.DOE']]/COP_rasterb[['HDD65']]))
names(COP_rasterb) = c(names(COP_rasterb)[1:4], c('AWCOP65.TOP','AWCOP65.PROTO.DOE'))
COP_rasterb = brick(COP_rasterb)

COPl = list(COP_rasterb = COP_rasterb, counter = counter)

saveRDS(COPl, file.path(DATA.PROC, 'WCOP_HDD_PRISM.rds'))
# State_COP_and_HDD.csv is written to DATA.PROC in 2_Analysis.R






###
###
###
### Part III ####
### 
### 

# Sources (copies located in Data_Archive)
# datagov = rbindlist(list(fread('https://data.openei.org/files/446/iouzipcodes2015.csv'),
#                          fread('https://data.openei.org/files/446/noniouzipcodes2015.csv')))
# 
# EIAf = list(M2022 = 'https://www.eia.gov/electricity/data/eia861m/archive/xls/sales_ult_cust_2022.xlsx',
#             M2021 = 'https://www.eia.gov/electricity/data/eia861m/archive/xls/sales_ult_cust_2021.xlsx',
#             M2020 = 'https://www.eia.gov/electricity/data/eia861m/archive/xls/retail_sales_2020.xlsx',
#             M2019 = 'https://www.eia.gov/electricity/data/eia861m/archive/xls/retail_sales_2019.xlsx',
#             M2018 = 'https://www.eia.gov/electricity/data/eia861m/archive/xls/retail_sales_2018.xlsx',
#             M2017 = 'https://www.eia.gov/electricity/data/eia861m/archive/xls/retail_sales_2017.xlsx')

datagov = rbindlist(list(fread(file.path(DATA.ARCHIVE, 'iouzipcodes2015.csv')),
                         fread(file.path(DATA.ARCHIVE, 'noniouzipcodes2015.csv'))))


EIAf = list(M2022 = file.path(DATA.ARCHIVE, 'EIA861','sales_ult_cust_2022.xlsx'),
            M2021 = file.path(DATA.ARCHIVE, 'EIA861','sales_ult_cust_2021.xlsx'),
            M2020 = file.path(DATA.ARCHIVE, 'EIA861','retail_sales_2020.xlsx'),
            M2019 = file.path(DATA.ARCHIVE, 'EIA861','retail_sales_2019.xlsx'),
            M2018 = file.path(DATA.ARCHIVE, 'EIA861','retail_sales_2018.xlsx'),
            M2017 = file.path(DATA.ARCHIVE, 'EIA861','retail_sales_2017.xlsx'))


for(i in 1990:2016){
  EIAf[[paste0('M',i)]] = file.path(DATA.ARCHIVE, 'EIA861', paste0('f826',i,'.xls'))
}


lookup = c(new = 'old',
           Year = 'Year', Year = 'YEAR',
           Month = 'Month', Month = 'MONTH',
           eiaid='Utility Number',
           eiaid='UTILITYID',
           eiaid='UTILITY_ID',
           eia_name = 'Utility Name',
           eia_name = 'UTILNAME',
           State = 'STATE_CODE',
           State = 'STATE',
           State = 'State',
           Revenue = 'RESIDENTIAL REVENUES ($1,000)', Revenue = 'Thousands Dollars...7', Revenue = 'Thousands Dollars...8', Revenue = 'RES_REV (Thousand $)',
           MWH = 'Megawatthours...8',MWH = 'Megawatthours...9', MWH = 'RES_SALES (MWh)', MWH = 'RESIDENTIAL SALES (MWh)',MWH = 'RES_SALES (MwH)',
           Count = 'Count...10',Count = 'Count...9', Count = 'RESIDENTIAL CUSTOMERS',Count = "RESIDENTIAL_CONS",Count = "RES_CONS")

EIAfx = lapply(EIAf, function(x){
 # tf = tempfile()
 # download.file(x, destfile = tf, mode='wb')
  trx = read_excel(x, sheet=1, skip=0)
  if('YEAR' %in% names(trx)) {sk = 0} else {
    sk = which(apply(trx, MAR=1, function(p) (sum(grepl(p, pattern='YEAR|year|[Yy]ear')>0)))==1)-0
  }
  rm(trx)
  rx = read_excel(x, sheet=1, skip = sk, na = c('','.')) %>%
    dplyr::select(1:10) %>%
    dplyr::rename(any_of(lookup)) %>%
    dplyr::mutate(Year = as.integer(gsub(Year, pattern='[A-Za-z]*', replacement = '')))
  
  if(is.null(rx$Count)) rx$Count = as.numeric(NA)
  if(is.null(rx$Ownership)) rx$Ownership = as.character(NA)
  
  return(rx %>% dplyr::select(Year, Month, eiaid, eia_name, State, Ownership, Revenue, MWH, Count))
})


EIAf = bind_rows(EIAfx) %>%
  dplyr::filter(!grepl(eia_name, pattern = 'National Total|Total EPM|[Mm]eter')) %>%
  dplyr::filter(!grepl(Ownership, pattern='[Mm]eter') & !State %in% c('HI','AK')) %>%
  dplyr::filter(is.na(Count) | Count!=0) %>% 
  dplyr::group_by(eiaid) %>%
  dplyr::mutate(Ownership = Mode(Ownership),
                adjustment = grepl(pattern='[Aa]djustment', eia_name),
                u.eia = paste0(eiaid, '-', adjustment,'--', State)) %>% 
  ungroup() %>%
  dplyr::mutate(Year = ifelse(Year<1998, Year+1900, Year)) %>%
  dplyr::arrange(eiaid, Year, Month) %>%
  dplyr::mutate(ym = as.yearmon(paste0(Year, '-', Month)),
                yq = as.yearqtr(ym)) %>%
  dplyr::filter(!is.na(ym) & !is.na(yq) & !is.na(eiaid)) %>%
#  dplyr::mutate(u.eia.yq = paste0(u.eia, '---', as.character(as.numeric(yq)))) %>%
  setDT()

      
      
      #### EIA 861M monthly regressions ####
      ### For each quarter, local linear regression using +- 23 months
      ### Note: final draft does not use this for analysis as equivalent data does not exist for Georgia
      qtrs = unique(EIAf$yq[EIAf$Year>=2010])
      qtrs = qtrs[order(qtrs)]
      
      eias = unique(EIAf$u.eia)
      
      window = 23
      kernel = c(1/((window):1)^.5, 1, 1/(1:(window))^.5)
      kernel = kernel/sum(kernel)
      
      hol = list()
      
      for(e in eias){
        print(e)
        ue = EIAf[u.eia==e,][order(ym),ww := 0]
        le = list()
        for(q in qtrs){
          print(q)
          ue$ww = 0
          qm = as.yearmon(q) + 1/12
          qmindex = which(ue$ym==qm)
          if(any(qmindex + -23:23 <1)|any(qmindex + -23:23 > NROW(ue))|!qm %in% ue$ym|sum(ue$Revenue, na.rm=T)<=0) next
          ue$ww[qmindex + -23:23] = kernel
          le[paste0(e, '---', as.character(q))] = list(tidy(feols(Revenue ~ MWH, ue, weights = ~ ww, verbose = 0)))
        }
        hol[e] = list(le)
      }
      
      marg = bind_rows(lapply(hol, bind_rows, .id = 'u.eia.yq')) %>% 
        pivot_wider(id_cols = 'u.eia.yq', names_from = 'term', values_from = 'estimate') %>%
        dplyr::select(u.eia.yq, fixed_total = "(Intercept)", cpkwh = MWH)
      
      EIAf = EIAf %>%
        dplyr::mutate(u.eia.yq = paste0(u.eia, '---', as.character(as.numeric(yq)))) %>%
        left_join(marg, by='u.eia.yq') %>%
        dplyr::mutate(fixed_cost = 1000*fixed_total/Count,
                      source = 'EIA861monthly')
      
      EIAfng = copy(EIAf)



#### Import and process GA survey data ####
# https://psc.ga.gov/utilities/electric/residential-rate-survey/

      
##--> Scrape GPSC website for files
parse = htmlParse( GET('https://psc.ga.gov/utilities/electric/residential-rate-surveys/'))
links =  grep(xpathSApply(parse, path="//a", xmlGetAttr, "href"), pattern='\\.xls', value=T)

links.dl = grep(links, pattern='\\.xls', value=T)
links.dl = c(grep(links.dl, pattern='[Aa]llproviders[Ww]', value=T, ignore.case=T),
             grep(links.dl, pattern='[Aa]llproviders[Ss]', value=T, ignore.case=T))

# 2011, 2012 is on separate sheets. Not today
links.dl = grep(links.dl, pattern='2010|2011|2012', invert=T, value=T)


jReg = function(DD){
  res = tidy(feols(value ~ 1 + kwh, data = DD))
  list(fixed_cost = res$estimate[1],
       cpkwh = res$estimate[2])
}

jReadGA<-function(x){
  cat('\n'); cat(x)
  GET(x, write_disk(tf <- tempfile(fileext = ".xls")))
  df <- read_excel(tf, skip = 0, col_names = F)
  skips = min(which(apply(df, MAR=1, FUN = function(x) sum(grepl(pattern='Charges', x)))>0))[1]
  cols.to.keep = which(apply(df, MAR=2, FUN = function(x) sum(grepl(pattern='Provider|Charges', x))>0))
  
  season = ifelse(grepl(df[,cols.to.keep[1]], pattern='[Ww]inter'), 'Winter','Summer')
  yr = as.integer(str_extract(df[,cols.to.keep[1], drop=T], pattern='(20\\d\\d)'))
  yr = yr[!is.na(yr)][1]
  
  dff = read_excel(tf, skip = skips, col_names = F)[,cols.to.keep]
  colnames(dff) = c('Provider',paste0('Charge_', c(500,1000,1500,2000)))
  dff$season = season
  dff$yr = as.character(yr)
  
  dff = dff %>%
    pivot_longer(cols = starts_with('Charge'), values_transform = list(value = as.numeric)) %>%
    dplyr::filter(!is.na(Provider) & Provider !='AVERAGE') %>%
    dplyr::mutate(kwh = as.integer(sapply(str_split(name, '_'), '[', 2))) %>%
    setDT()
  
  dff = dff[,jReg(.SD), by = .(Provider, season, yr)]
  return(dff)
}

gal = lapply(links.dl,jReadGA)
gal = rbindlist(gal) 


ugal = unique(gal$Provider)  # GA utilities from state
udg = unique(datagov[state=='GA',.(utility_name, eiaid)])  # GA utilities from datagov (EIA 2015) used to match zip to utility
udg[,utility_name:=gsub(pattern='^\\s+|\\s+$', replacement = '', gsub(pattern='\\(GA\\)|-', replacement = '', utility_name))]


ugal.most.common = paste(ugal, collapse = ' ')
tabs = as.data.frame(sort(table(unlist(strsplit(ugal.most.common, " |\\(|\\)"))),      # Create frequency table
                          decreasing = TRUE))
ugal = data.table(Provider = ugal)
ugal[,c('least1','least2'):=rbindlist(lapply(str_split(ugal$Provider, pattern=' |\\(|\\)'),
                                             function(n){
                                               listing = tabs$Freq[match(n, tabs$Var1)] 
                                               n = n[order(listing)]
                                               data.table(least1 = n[1],
                                                          least2 = n[2])
                                             }))]

udg.most.common = paste(udg$utility_name, collapse = ' ')
tabs = as.data.frame(sort(table(unlist(strsplit(udg.most.common, ',| |\\(|\\)|-'))),      # Create frequency table
                          decreasing = TRUE))

udg[,c('least1','least2'):=rbindlist(lapply(str_split(udg$utility_name, pattern=',| |\\(|\\)|-'),
                                            function(n){
                                              listing = tabs$Freq[match(n, tabs$Var1)] 
                                              n = n[order(listing)]
                                              data.table(least1 = n[1],
                                                         least2 = n[2])
                                            }))]


ugal$match = match(ugal$least1, udg$least1)   
gal.match = cbind(ugal, udg[ugal$match,])
gal.match = unique(gal.match[,.(Provider, eiaid, utility_name)])

gal = merge(x = gal, y = gal.match[,.(Provider, eiaid)],
            by='Provider',
            all.x = T,
            all.y = F)
# add in months (to match EIAf)
gal[,ym:=as.yearmon(mdy(paste0(ifelse(season=='Winter', '03-01-', '09-01-'), yr)))]
gal[,yml:=list(list(rbind(.SD[,.(ym)], .SD[,.(ym)]-(1/12),
                          .SD[,.(ym)]-(2/12), .SD[,.(ym)]-(3/12),
                          .SD[,.(ym)]-(4/12), .SD[,.(ym)]-(5/12)))), by= .(Provider, fixed_cost, cpkwh, eiaid, ym)]
gal = gal[,rbindlist(yml), by = .(Provider, fixed_cost, cpkwh, eiaid)]
gal = gal[order(Provider, eiaid, ym),.(eiaid, Provider, ym, fixed_cost, cpkwh)]

## Georgia power co is the only investor owned
gal[,Ownership:=case_when(grepl(Provider, pattern='Georgia Power') ~ 'Investor Owned',
                          grepl(Provider, pattern='EMC|Coop') ~ 'Cooperative',
                          grepl(Provider, pattern='City of') ~ 'Municipal',
                          TRUE ~ 'Municipal')]
gal[,c('State','adjustment',
       'u.eia','u.eia.yq' ,
       'yq','Year',
       'Month','eia_name',
       'source'):=list('GA',
                       FALSE,
                       paste0(eiaid, '-', 'FALSE','--GA'),
                       paste0(eiaid,'-FALSE--GA---',as.numeric(as.yearqtr(ym))),
                       as.yearqtr(ym),
                       lubridate::year(ym),
                       lubridate::month(ym),
                       Provider,
                       'GA-MPSC')]

# Jackson is supposed to be the EMC Jackson
gal = gal[Provider!='Jackson Electric (City of)',]
# Washington seems to be City of Washington
gal = gal[Provider!='Washington EMC',]


ga.EIAf = unique(gal$eiaid)[!is.na(unique(gal$eiaid))]
gal.EIAf = gal[!is.na(eiaid),.(Year, Month, eiaid, eia_name, State, Ownership, adjustment, u.eia, ym, yq, u.eia.yq, cpkwh, fixed_cost, source )]


## Replace the gal.EIAf observations in EIAf

EIAf = rbindlist(list(EIAf[!eiaid %in% gal.EIAf$eiaid,],
                      gal.EIAf), fill=T)


# EIAfng was copied before this. It is without-GPSC data
# EIAf is with-GPSC data (replacing those areas that are in GPSC data)
# Both will get saved





##### HIFLD to EIA + USURDB Crosswalk ####
##### using datagov (IOU and non-IOU average rate has zip-eiaid) https://data.openei.org/submissions/5650
##### But may be based on county reported in 861? --> doesn't look like it as resulting territories don't follow county boundaries
fzip = readRDS(file.path(DATA.ARCHIVE, 'USA_Zip_Code_Boundaries_v104_zip_poly.rds')) %>%
  st_as_sf()


## In datagov, we need to get to one eiaid per zip, collapsed by largest server for that zip, which means I need EIAf$Count from non-monthly
## Need 861 data for all. Using 2018:
tf = tempfile(); td = tempdir()
download.file('https://www.eia.gov/electricity/data/eia861/archive/zip/f8612018.zip', destfile = tf, mode = 'wb')
unzip(tf, list = F, exdir = td)
EIAys = read_xlsx(file.path(td, 'Sales_Ult_Cust_2018.xlsx'), skip = 2, na = '.') %>%
  dplyr::select(eiaid = `Utility Number`, utilityName = `Utility Name`, state = State, 
                Count = `Count...12`, serviceType.EIA = `Service Type`, ownership.EIA861 = `Ownership`, Part) %>%
  dplyr::filter(Part!='C') %>% dplyr::select(-Part) %>%
  group_by(eiaid, state,utilityName, ownership.EIA861) %>%
  dplyr::summarize(Count = sum(as.numeric(Count), na.rm=T)) %>%
  ungroup()

EIAysh = read_xlsx(file.path(td, 'Short_Form_2018.xlsx'), skip = 0, na = '.') %>%
  dplyr::select(eiaid = `Utility Number`, utilityName = `Utility Name`, state = State, Count = `Total Customers`, ownership.EIA861 = `Ownership`) %>%
  group_by(eiaid, state, utilityName, ownership.EIA861) %>%
  dplyr::summarize(Count = sum(as.numeric(Count), na.rm=T)) %>%
  ungroup()

EIAys = bind_rows(EIAys, EIAysh) %>% arrange(eiaid, state, utilityName)
unlink(tf)
unlink(td)

datagov.merge = merge(x = datagov, y = EIAys %>% dplyr::select(eiaid, state, Count, ownership.EIA861), by = c('eiaid','state'), all.x = T, all.y = F) # doesn't have TERRITORY for CA but could add around here.
datagov.merge[is.na(Count), Count:=0]

datagov.merge[,c('allNames','numUtilities'):=list(paste(unique(utility_name[order(-Count)][Count>0]), collapse = ';   '),
                                                  length(unique(eiaid[Count>0]))), by='zip']
datagov.merge = datagov.merge[,j=.SD[which.max(Count)[1],], by='zip', .SDcols = c('eiaid','utility_name','ownership','Count','numUtilities','allNames')]



fzip = left_join(fzip, datagov.merge, by='zip')



### Some missing zipcodes. Assign to nearest utility
fzip.good = fzip %>% dplyr::filter(!is.na(eiaid))
fzip.bad =  fzip %>% dplyr::filter(is.na(eiaid)) 


good.index = st_nearest_feature(fzip.bad %>% st_centroid(of_largest_polygon=T), fzip.good %>% st_centroid(of_largest_polygon=T))
fzip.bad = fzip.bad %>%
  dplyr::mutate(eiaid = fzip.good$eiaid[good.index],
                utility_name = fzip.good$utility_name[good.index],
                Count = fzip.good$Count[good.index],
                numUtilities = 0,
                allNames = '',
                inUSURDB = fzip.good$inUSURDB[good.index])

fzip = bind_rows(fzip.bad, fzip.good)

#### fzip has the spatial utility data at the zip level
#### EIAf/EIAfng has another EIAID-level price index (both average price per kwh and the regression-based fixed/marginal decomp)
#### 

saveRDS(list(fzip=fzip, EIAfng = EIAfng, EIAf = EIAf), file.path(DATA.PROC, 'Spatial_Rate_Database.rds'))










#### Part IV: Pull and Process2009/2015/2020 RECS ####
#### Used for RECS analsyis in Section 2-3. RECS is re-compiled for Section 4.


# RECS09 = fread('https://www.eia.gov/consumption/residential/data/2009/csv/recs2009_public.csv')[,IECC_CLIMATE_PUB:=IECC_Climate_Pub][,UATYP10:=UR]
RECS09 = fread(file.path(DATA.ARCHIVE, 'RECS','recs2009_public.csv'))[,IECC_CLIMATE_PUB:=IECC_Climate_Pub][,UATYP10:=UR]

# RECS15 = fread('https://www.eia.gov/consumption/residential/data/2015/csv/recs2015_public_v4.csv')
RECS15 = fread(file.path(DATA.ARCHIVE, 'RECS','recs2015_public_v4.csv'))

# RECS20 = fread('https://www.eia.gov/consumption/residential/data/2020/csv/recs2020_public_v4.csv')[,IECC_CLIMATE_PUB:=IECC_climate_code][,HDD30YR:=HDD30YR_PUB]
RECS20 = fread(file.path(DATA.ARCHIVE, 'RECS','recs2020_public_v4.csv'))[,HDD30YR:=HDD30YR_PUB]



DICT = file.path(DATA.ARCHIVE, 'RECS','Documentation')


dict09 = readxl::read_excel(file.path(DICT, 'RECS09','recs2009_public_codebook.xlsx'), skip = 2, range = 'A3:D943') %>%
  dplyr::select(c(1:4)) %>%
  setNames(., c('VARNAME','DESC09','CODES09','LABELS09')) 

dict15 = readxl::read_excel(file.path(DICT, 'RECS15','codebook_public_RECS15.xlsx'), skip = 3) %>%
  dplyr::select(c(1,4,5,6)) %>%
  setNames(., c('VARNAME','DESC15','CODES15','LABELS15')) %>%
  dplyr::filter(!grepl(pattern='IECC', DESC15))

 
dict20 = readxl::read_excel(file.path(DICT, 'RECS20','RECS 2020 Codebook for Public File - v4.xlsx'), skip = 1) %>%
  dplyr::select(c(1,3,4)) %>%
  dplyr::mutate(LABELS20 = NA) %>%
  setNames(., c('VARNAME','DESC20','CODES20','LABELS20')) 



useID = c('DOEID','REGIONC','DIVISION','REPORTABLE_DOMAIN','state_postal','CLIMATE_REGION_PUB',
          'Climate_Region_Pub','BA_climate','IECC_CLIMATE_PUB',# 'IECC_Climate_Code','IECC_CLIMATE_PUB','IECC_climate_code',
          'METROMICRO','UATYP10','UR','NWEIGHT')

useHH = c('TYPEHUQ','BEDROOMS','STORIES',paste0('ENERGYASST',11:20),'HELPHT',
          'KOWNRENT','ELPAY','LPGPAY','NGPAY', 'YEARMADERANGE',
          'YEARMADE','MONEYPY','HHAGE','TOTROOMS','NCOMBATH','NHAFBATH','NHSLDMEM','SDESCENT',
          'TOTHSQFT','TOTHSQFT_EN','TOTSQFT','TOTSQFT_EN','ADQINSUL','ALTFUELPEV','ATHOME','BACKUP','BASEFIN','BASEHEAT',
          'TOTALBTUSPH')

useWeather = c('CDD65','HDD65','CD65','HD65','HDD30YR','TEMPGONE','TEMPHOME','TEMPGONEAC','TEMPHOMEAC','TEMPNITE','TEMPNITEAC','SMARTMETER')

useHVAC = c('AGECENAC','AIRCOND','ACEQUIPM_pub','ACEQUIPAUXTYPE_pub','ACEQUIPAGE','AUDIT','CENACHP','COOLTYPE','DNTHEAT','ELCOOL','ELWARM',
            'EQUIPAGE','EQUIPAUX','EQUIPM','FOPAY','FOWARM','AUDIT','AGEAUD','FUELAUX','FUELH20','FUELHEAT','HEATHOME',
            'LPWRM','LPWATER','SOLWATER','USECENAC','USELP','USENG','USESOLAR','USEWWAC','WWACAGE','FOWATER')

useHVACSpec = c('REVERSE','WARMAIR','FURNFUEL','STEAMR','RADFUEL','PERMELEC','PIPELESS','PIPEFUEL','ROOMHEAT','ROOMHEATFUEL',
                'WOODKILN','HSFUEL','CARRYEL','CARRYKER','CHIMNEY','FPFUEL','NGFPFLUE','USENGFP','RANGE',
                'ELECAUX','UGASAUX','LPGAUX','FOILAUX','KEROAUX','WOODAUX') # how do these differ from WOODKILN....RANGE??

useCons = c('BTUFO','BTULP','CUFEETNG','DOLLAREL','DOLLARFO','DOLLARLP','DOLLARNG','GALLONLP','KWH')

useAll = unique(c(useID, useHH, useWeather, useHVAC, useHVACSpec, useCons))


keepUse<-function(data, useAll){
  nmu = intersect(names(data), useAll)
  nmu = nmu[na.omit(match(useAll, nmu))]
  data[,nmu, with = F]
}

# RECS05 = keepUse(RECS05, useAll)
RECS09 = keepUse(RECS09, useAll)
RECS15 = keepUse(RECS15, useAll)
RECS20 = keepUse(RECS20, useAll)

RECS = list( RECS09 = RECS09, RECS15 = RECS15, RECS20 = RECS20)

tableVar = function(x, var){
  if(var %in% names(x)){
    table(x[,var, with = F])
  } else {
    paste(var, 'not present')
  }
}


### Set variable dictionaries

handleDict<-function(xu, keycol = NULL, valuecol = NULL, namecol = NULL){ # x = dict09[3,]
  cat('New Dictionary'); cat('\n\n\n\n')
  ll = split(xu, 1:NROW(xu))
  ll = lapply(ll, function(x){
    varname = as.character(x[namecol])
    cat(varname); cat('\n')
    key = as.character(x[keycol])
    val = as.character(x[valuecol])
    
    is.multi.key = grepl('\\n|\\r', key)
    is.multi.val = grepl('\\n|\\r', val)
    # stopifnot(is.multi.key==is.multi.val)
    
    if(is.multi.key==FALSE|is.multi.val==FALSE) return(NULL)  # what to do for those that don't have matches?
    
    if( all(as.character(read_lines(val)) %in% c('Yes','No','Not Applicable','Not applicable'))){
      dict = data.frame(key = as.integer(read_lines(key)),
                        value = case_when(as.integer(read_lines(key))==1 ~ TRUE,
                                          as.integer(read_lines(key))==0 ~ FALSE,
                                          as.integer(read_lines(key))==-2 ~ NA))
    } else {
      if( all(as.character(read_lines(key)) %in% c('U','R','C','Urban','Rural'))) {
        dict = data.frame(key = as.character(read_lines(key)),
                          value = as.character(read_lines(val))[as.character(read_lines(val)) !=''])
      } else {
        dict = data.frame(key = as.integer(read_lines(key)),
                          value = as.character(read_lines(val))[as.character(read_lines(val)) !=''])
      }
    }
    return(dict)})
  names(ll) = xu$VARNAME
  return(ll)
}

# keep only those dictionary variables that are used:
dict = lapply(list(dict09 = dict09,
                   dict15 = dict15), function(x) x %>% dplyr::filter(x$VARNAME %in% useAll))

dict = lapply(dict, handleDict, keycol = 3, valuecol = 4, namecol = 1)
#--> dict holds mappings that might be appropriate for >1 variable (if re-use values)


dict20 = list(
  REGIONC = data.frame(key = c('MIDWEAST','NORTHEAST','SOUTH','WEST'),
                       value = c('Midwest','Northeast','South','West')),
  DIVISION = NULL, # Already in the right values
  state_postal = NULL,
  BA_climate = NULL,
  IECC_CLIMATE_PUB = NULL,
  UATYP10 = data.frame(key = c('C','R','U'),
                       value = c('Urban cluster','Rural area','Urban area')),
  NWEIGHT = NULL,
  TYPEHUQ = dict[[2]]$TYPEHUQ,  # matches that one
  STORIES = data.frame(key = c(1,2,3,4,5,-2),
                       value = c('One story','Two stories','Three stories','Four or more stories','Split-level',NA)),
  ENERGYASST16 = dict[[2]]$ENERGYASST11,
  ENERGYASST17 = dict[[2]]$ENERGYASST11,
  ENERGYASST18 = dict[[2]]$ENERGYASST11,
  ENERGYASST19 = dict[[2]]$ENERGYASST11,
  ENERGYASST20 = dict[[2]]$ENERGYASST11,
  KOWNRENT = dict[[2]]$KOWNRENT,
  ELPAY = data.frame(key = c(1L, 2L, 3L, 99L), 
                     value = c("Household is responsible for paying for all used in this home", 
                               "All used in this home is included in the rent or condo fee",
                               "Some is paid by the household, some is included in the rent or condo fee",
                               "Paid for some other way")),
  LPGPAY = data.frame(key = c(1L, 2L, 3L, 99L), 
                      value = c("Household is responsible for paying for all used in this home", 
                                "All used in this home is included in the rent or condo fee",
                                "Some is paid by the household, some is included in the rent or condo fee",
                                "Paid for some other way")),
  NGPAY = data.frame(key = c(1L, 2L, 3L, 99L), 
                     value = c("Household is responsible for paying for all used in this home", 
                               "All used in this home is included in the rent or condo fee",
                               "Some is paid by the household, some is included in the rent or condo fee",
                               "Paid for some other way")),
  YEARMADERANGE = data.frame(key = 1:9, 
                             value = c("Before 1950", "1950 to 1959", 
                                       "1960 to 1969", "1970 to 1979", "1980 to 1989", "1990 to 1999", 
                                       "2000 to 2009", "2010 to 2015","2016 to 2020")),
  MONEYPY = data.frame(key = 1:16,
                       value = c(2500, mean(c(5000,7500)),
                                 mean(c(7500,10000)), 11125, 13750,
                                 17750, 22500, 27500, 32500,37500,
                                 45000, 55000, mean(c(60000, 75000)),
                                 87500, 125000, 175000)),
  HHAGE = NULL,
  TOTROOMS = NULL,
  SDESCENT = dict[[2]]$ENERGYASST11, # yes  no na
  ADQINSUL = dict[[2]]$ADQINSUL,
  BACKUP = dict[[2]]$ENERGYASST11,
  BASEFIN = dict[[2]]$ENERGYASST11,
  BASEHEAT = dict[[2]]$ENERGYASST11,
  SMARTMETER = data.frame(key = c(1, 0, -4), 
                          value = c("Yes","No","Don't know")),
  AIRCOND = dict[[2]]$ENERGYASST11,
  ACEQUIPM_pub = data.frame(key = c(1,3,4,5,6,-2),
                            value = c('Central AC including central HP','Ductless HP','Window AC','Portable AC','Swamp cooler',NA)),
  ACEQUIPAUXTYPE_pub = data.frame(key = c(0,1,3,4,5,6,-2),
                                  value = c('No other equipment','Central AC including central HP','Ductless HP','Window AC','Portable AC','Swamp cooler',NA)),
  ACEQUIPAGE = data.frame(key = c(1,2,3,4,5,6,-2),
                          value = c('Less than 2 years old','2 to 4 years old','5 to 9 years old','10 to 14 years old','15 to 19 years old','20 or more years old',NA)),
  ATHOME = data.frame(key = c(0,1,2,3,4,5),
                      value = c(0,1,2,3,4,5)), # # of days home all or most of day
  DNTHEAT = data.frame(key = c(1,2,-2),
                       value = c('Have space heating equipment, but do not use it','Do not have any space heating equipment',NA)),
  ELCOOL = dict[[2]]$ENERGYASST11,
  ELWARM = dict[[2]]$ENERGYASST11,
  EQUIPAGE = data.frame(key = c(1,2,3,4,5,6,-2),
                        value = c('Less than 2 years old','2 to 4 years old','5 to 9 years old','10 to 14 years old','15 to 19 years old','20 years old',NA)),
  EQUIPAUX = dict[[2]]$EQUIPAUX, # EQUIPAUXTYPE and USEEQUIPAUX are...not used?
  EQUIPM = data.frame(key = c(3,2,4,13,5,7,8,10,99,-2),
                      value = c('Central furnace','Steam/hot water system with radiators or pipes','Central Heat pump','Ductless heat pump',
                                'Built-in electric units installed in walls, ceilings, baseboards, or floors',
                                'Built-in room heater burning gas or oil','Wood-burning stove (cordwood or pellets)','Portable electric heaters',
                                'Some other equipment',NA)),
  FOPAY = data.frame(key = c(1L, 2L, 3L, 99L, -2L), 
                     value = c("Household is responsible for paying for all used in this home", 
                               "All used in this home is included in the rent or condo fee", 
                               "Some is paid by the household, some is included in the rent or condo fee", 
                               "Paid for some other way", "Not applicable")),
  FOWARM = dict[[2]]$ENERGYASST11,
  FUELAUX = data.frame(key = c(1L, 2L, 3L, 5L, 7L, 99L, -2L), 
                       value = c("Natural gas from underground pipes", 
                                 "Propane (bottled gas)", "Fuel oil/kerosene", "Electricity", 
                                 "Wood (cordwood or pellets)", "Some other fuel", "Not applicable")),
  FUELHEAT = data.frame(key = c(1L, 2L, 3L, 5L, 7L, 99L, -2L), 
                        value = c("Natural gas from underground pipes", 
                                  "Propane (bottled gas)", "Fuel oil/kerosene", "Electricity", 
                                  "Wood (cordwood or pellets)", "Some other fuel", "Not applicable")),
  HEATHOME = dict[[2]]$ENERGYASST11,
  LPWATER = dict[[2]]$ENERGYASST11,
  SOLWATER = dict[[2]]$ENERGYASST11,
  USELP = dict[[2]]$ENERGYASST11,
  USENG = dict[[2]]$ENERGYASST11,
  USESOLAR = dict[[2]]$ENERGYASST11,
  FOWATER = dict[[2]]$ENERGYASST11,
  RANGE = NULL)




dict = c(dict, list(dict20 = dict20))



match.values = function(y, varname, dictionary = dict$dict09){
  if(is.null(dictionary[[varname]])){return(y)}
  
  idx = match(y, dictionary[[varname]]$key)
  dictionary[[varname]]$value[idx]
}


RECS09p = lapply(names(RECS09), function(z){
  too  =  match.values(RECS09[[z]], varname = z, dictionary = dict$dict09)
  too[too=='Not Applicable'] = NA
  too
})

RECS09p = as.data.table(RECS09p)
names(RECS09p) = names(RECS09)


RECS15p = lapply(names(RECS15), function(z){
  too  =  match.values(RECS15[[z]], varname = z, dictionary = dict$dict15)
  too[too=='Not Applicable'] = NA
  too
})

RECS15p = as.data.table(RECS15p)
names(RECS15p) = names(RECS15)


RECS20p = lapply(names(RECS20), function(z){
  too  =  match.values(RECS20[[z]], varname = z, dictionary = dict$dict20)
  too[too=='Not Applicable'] = NA
  too
})

RECS20p = as.data.table(RECS20p)
names(RECS20p) = names(RECS20)


RECS09p$DOEID = paste0(RECS09p$DOEID, '-09'); RECS09p$RECS = 2009
RECS15p$DOEID = paste0(RECS15p$DOEID, '-15'); RECS15p$RECS = 2015
RECS20p$DOEID = paste0(RECS20p$DOEID, '-20'); RECS20p$RECS = 2020


RECS = rbindlist(list(RECS20p, RECS15p, RECS09p), fill=T)

write_csv(RECS, file.path(DATA.PROC, 'RECS.csv'))







###
###
### Part V: CPI ####
### 
### 

CPI <- unique(bind_rows(list(blsAPI(list('seriesid'=c('CUSR0000SA0'), 'startyear'='2010', 'endyear'='2019', 'registrationKey' = blsAPIkey, api_version=2), return_data_frame=T) ,
                             blsAPI(list('seriesid'=c('CUSR0000SA0'), 'startyear'='2020', 'endyear'='2022', 'registrationKey' = blsAPIkey, api_version=2), return_data_frame=T)))) %>%
  dplyr::mutate(period = as.integer(substr(period, 2, 3)),
                ym = as.yearmon(paste0(year, '-', period)),
                yq = as.yearqtr(ym),
                CPI = as.numeric(value)) %>%
  dplyr::select(ym, yq, CPI) %>%
  dplyr::arrange(ym) %>%
  dplyr::mutate(CPI2019 = CPI / CPI[ym==as.yearmon('Jan 2019')])

write.csv(CPI, file.path(DATA.PROC, 'CPI.csv')) 
